Concept categorization (data: how should it be defined for topic models?)
In [1]:
%matplotlib notebook
import itertools
import logging
from functools import partial
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE
from knub.thesis.util import *
In [8]:
d = np.array([
    [1.0, 2.0, 3.1],
    [0.5, 1.2, 4.0],
    [-1.0, 2.1, 1.0]
])
pca(d, 2)  # quick smoke test of the pca helper on a toy matrix
Out[8]:
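The pca helper comes from the star import of knub.thesis.util; its implementation is not shown in this notebook. A minimal sketch of what it presumably does, assuming it simply wraps scikit-learn's PCA:
In [ ]:
# Hypothetical sketch of the pca helper imported from knub.thesis.util;
# the actual implementation is not part of this notebook.
from sklearn.decomposition import PCA

def pca(data, n_components):
    # Project data (n_samples x n_features) onto its first
    # n_components principal components.
    return PCA(n_components=n_components).fit_transform(data)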
In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [4]:
from IPython.core.display import HTML
HTML("""
<style>
    div.text_cell_render p, div.text_cell_render ul, table.dataframe {
        font-size: 1.3em;
        line-height: 1.1em;
    }
</style>
""")
Out[4]:
In [13]:
# Prepare data in long form: one row per (topic, position, word)
df_topics = pnd.read_csv("../models/topic-models/topic.full.fixed-vocabulary.alpha-1-100.256-400.model.ssv",
                         sep=" ")
df_topics = df_topics.iloc[:, -10:]  # keep only the ten top-word columns
df_topics.columns = list(range(10))
df_topics["topic"] = df_topics.index
df_topics["topic_name"] = df_topics[0]  # name each topic after its top word
df = pnd.melt(df_topics, id_vars=["topic", "topic_name"], var_name="position", value_name="word")
df = df[["word", "topic", "topic_name", "position"]]
df = df.sort_values(by=["topic", "position"]).reset_index(drop=True)
df[df.topic == 0]
Out[13]:
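For reference, a toy version of the wide-to-long reshape that pnd.melt performs above (made-up words, not the actual topic data):
In [ ]:
# Toy illustration of the wide-to-long reshape used above.
wide = pnd.DataFrame({"topic": [0, 1], 0: ["bank", "atom"], 1: ["money", "proton"]})
pnd.melt(wide, id_vars=["topic"], var_name="position", value_name="word")
# -> one row per (topic, position, word) combination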
In [40]:
WORD2VEC_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/GoogleNews-vectors-negative300.bin"
GLOVE_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/glove.6B.50d.txt"
CBOW_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/embedding.model.cbow"
SKIP_GRAM_VECTOR_FILE = "/home/knub/Repositories/master-thesis/models/word-embeddings/embedding.model.skip-gram"
#vectors_glove = gensim.models.Word2Vec.load_word2vec_format(GLOVE_VECTOR_FILE, binary=False)
#vectors_skip = gensim.models.Word2Vec.load_word2vec_format(SKIP_GRAM_VECTOR_FILE, binary=True)
#vectors_cbow = gensim.models.Word2Vec.load_word2vec_format(CBOW_VECTOR_FILE, binary=True)
vectors_word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_VECTOR_FILE, binary=True)
vectors_default = vectors_word2vec
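A quick sanity check that the loaded embeddings behave as expected; the exact scores depend on the embedding file used:
In [ ]:
# Sanity check on the loaded embeddings.
print(vectors_default.similarity("king", "queen"))
print(vectors_default.most_similar("king", topn=3))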
In [42]:
def get_data_frame_from_word_vectors(df_param, vectors):
    # Keep only words present in the embedding vocabulary and
    # attach each word's vector as a new column.
    df_param = df_param[df_param["word"].apply(lambda word: word in vectors)].copy()
    df_param["embeddings"] = df_param["word"].apply(lambda word: vectors[word])
    return df_param
df = get_data_frame_from_word_vectors(df.copy(), vectors_default)
df[df.topic == 0]
Out[42]:
In [43]:
# Candidate topics: financial, muslim, sports teams, atomic physics, math
nice_topics = [5, 117, 158, 164, 171]
# Overridden: use a smaller subset for the plots below
nice_topics = [0, 7, 236]
df_part = df[df.topic.apply(lambda topic: topic in nice_topics)].copy()
# Show topics of interest
df_tmp = pnd.DataFrame(df_part.groupby("topic")["word"].apply(lambda l: l.tolist()).tolist())
df_tmp.index = nice_topics
df_tmp
Out[43]:
In [45]:
def plot_topics_in_embedding_space(reduction_method, df_param):
    # Reduce the word embeddings to two dimensions and scatter-plot them,
    # colored by topic, with each word labeled below its point.
    embeddings = np.array(df_param["embeddings"].tolist())
    X = reduction_method(embeddings)
    df_tmp = df_param.copy()
    df_tmp["x"] = X[:, 0]
    df_tmp["y"] = X[:, 1]
    df_tmp = df_tmp[df_tmp.topic.apply(lambda topic: topic in nice_topics)]
    colors = {0: "red", 7: "blue", 236: "green", 164: "yellow", 171: "black"}
    plt.figure(figsize=(12, 8))
    plt.scatter(df_tmp.x, df_tmp.y, c=df_tmp.topic.apply(lambda topic: colors[topic]), s=80)
    ylim = plt.gca().get_ylim()
    step = (ylim[1] - ylim[0]) / 100  # small vertical offset for the labels
    for index, row in df_tmp.iterrows():
        plt.text(row.x, row.y - step, row.word, horizontalalignment='center', verticalalignment='top')
In [46]:
#plot_topics_in_embedding_space(pca, df)
In [47]:
plot_topics_in_embedding_space(pca, df_part)  # third dimension?
In [ ]:
#plot_topics_in_embedding_space(tsne, df)
In [22]:
plot_topics_in_embedding_space(tsne_with_init_pca, df)
In general, the words of a topic from the topic model do not seem to occupy similar positions in the embedding space.
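The tsne and tsne_with_init_pca helpers also come from knub.thesis.util; a plausible sketch, assuming they wrap scikit-learn's TSNE:
In [ ]:
# Hypothetical sketches of the tsne helpers from knub.thesis.util;
# the actual implementations are not part of this notebook.
from sklearn.manifold import TSNE

def tsne(data):
    return TSNE(n_components=2).fit_transform(data)

def tsne_with_init_pca(data):
    # Initializing t-SNE with the PCA projection usually gives a more
    # stable layout than the default random initialization.
    return TSNE(n_components=2, init="pca").fit_transform(data)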
In [48]:
def average_pairwise_similarity(words, vectors):
    # Mean cosine similarity over all unordered word pairs.
    word_pairs = itertools.combinations(words, 2)
    similarities = [vectors.similarity(word1, word2) for word1, word2 in word_pairs]
    return np.mean(similarities)

def average_top_similarity(words, vectors):
    # For each word, take its highest similarity to any other word, then
    # average these maxima. permutations() yields the pairs grouped by
    # their first element, so groupby needs no extra sorting.
    word_pairs = itertools.permutations(words, 2)
    similarities = [(word1, vectors.similarity(word1, word2)) for word1, word2 in word_pairs]
    max_similarities = [max(s for w, s in group) for _, group in itertools.groupby(similarities, lambda s: s[0])]
    return np.mean(max_similarities)
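A small usage example of the two statistics on a hand-picked word set (scores depend on the embedding file):
In [ ]:
# Example: both similarity statistics on a small word set.
words = ["king", "queen", "prince"]
print(average_pairwise_similarity(words, vectors_default))
print(average_top_similarity(words, vectors_default))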
In [49]:
topic_lengths = list(range(2, 11))

def calculate_similarities_for_topic(df_topic, sim_function, vectors):
    # Similarity of the first n topic words, for n = 2..10.
    words_in_topic = df_topic["word"].tolist()
    average_similarities = [sim_function(words_in_topic[:topic_length], vectors)
                            for topic_length in topic_lengths]
    return pnd.Series(average_similarities)

def calculate_similarity_matrix(sim_function, vectors):
    # One row per topic, one column per topic length.
    def partial_function(df_topic):
        return calculate_similarities_for_topic(df_topic, sim_function, vectors)
    df_similarities = df.groupby("topic").apply(partial_function)
    df_similarities.columns = ["%s-words" % i for i in topic_lengths]
    return df_similarities
In [50]:
df_similarities = calculate_similarity_matrix(average_pairwise_similarity, vectors_default)
df_similarities.mean()
Out[50]:
In [51]:
means = df_similarities.mean().tolist()
plt.figure(figsize=(12, 8))
plt.scatter(topic_lengths, means, s=80)
plt.title("Avg. word similarity (cosine similarity in word-embedding space) of topics up to the nth word")
plt.xlim(0, 11)
plt.xticks(list(range(1, 12)))
#plt.ylim((0, 0.35))
plt.xlabel("topic length")
plt.ylabel("average similarity")
Out[51]:
For comparison, here are a few standard similarities:
king-prince: {{vectors_default.similarity("king", "prince")}}
king-queen: {{vectors_default.similarity("king", "queen")}}
topic-topics: {{vectors_default.similarity("topic", "topics")}}
buy-purchase: {{vectors_default.similarity("buy", "purchase")}}
In [52]:
def show_highest_similar_topics(topic_length, nr_topics=3):
    # Return the nr_topics topics whose first topic_length words
    # have the highest average similarity.
    column = "%s-words" % topic_length
    df_top = df_similarities.sort_values(by=column, ascending=False)[:nr_topics]
    return df_top.join(df_topics)[[column] + list(range(topic_length))]
In [53]:
show_highest_similar_topics(3)
Out[53]:
In [54]:
show_highest_similar_topics(6)
Out[54]:
In [55]:
show_highest_similar_topics(10)
Out[55]: